library(lmtest) # Dw test
library(ggplot2)
library(lubridate) # 날짜 다루는 패키지
해당 자료는 전북대학교 이영미 교수님 2023고급시계열분석 자료임
패키지 설치
options(repr.plot.width = 15, repr.plot.height = 8)
국내총인구
<- scan("population.txt")
z head(z)
- 25012374
- 25765673
- 26513030
- 27261747
- 27984155
- 28704674
= round(z/10000)
pop head(pop)
- 2501
- 2577
- 2651
- 2726
- 2798
- 2870
<- data.frame(
tmp.data day = seq(ymd("1960-01-01"),by='year', length.out=length(z)), #60년 1월 1일부터 연단위로, 데이터길이는 z와 동일하게.
#day = 1959 + 1:length(z),
pop = round(z/10000),
t = 1:length(z),
t2 = (1:length(z))^2
)head(tmp.data)
day | pop | t | t2 | |
---|---|---|---|---|
<date> | <dbl> | <int> | <dbl> | |
1 | 1960-01-01 | 2501 | 1 | 1 |
2 | 1961-01-01 | 2577 | 2 | 4 |
3 | 1962-01-01 | 2651 | 3 | 9 |
4 | 1963-01-01 | 2726 | 4 | 16 |
5 | 1964-01-01 | 2798 | 5 | 25 |
6 | 1965-01-01 | 2870 | 6 | 36 |
시도표그리기
ggplot(tmp.data, aes(day, pop)) +
geom_line(col='skyblue', lwd=3) +
geom_point(col='steelblue', cex=3)+
theme_bw() +
#scale_x_date(date_labels = "%Y-%m") +
theme(axis.title=element_blank(), # ggplot 배경 변경
axis.text= element_text(size=20))
ggplot(tmp.data, aes(day, pop)) +
geom_line(col='skyblue', lwd=3) +
geom_point(col='steelblue', cex=3)+
xlab("time") + ylab("Population")+
theme_bw() +
#scale_x_date(date_labels = "%Y-%m") +
theme(axis.text= element_text(size=20),
axis.title= element_text(size=30))
1차 선형 추세 모형
\[\text{모형}: Z_t = \beta_0 + \beta_1 t + \epsilon_t, t=1,\dots, n\]
<- lm(pop~t, data=tmp.data)
m1 summary(m1)
Call:
lm(formula = pop ~ t, data = tmp.data)
Residuals:
Min 1Q Median 3Q Max
-115.40 -48.30 16.87 54.37 63.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2559.3889 20.0385 127.72 <2e-16 ***
t 57.0135 0.9444 60.37 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 58.87 on 34 degrees of freedom
Multiple R-squared: 0.9908, Adjusted R-squared: 0.9905
F-statistic: 3644 on 1 and 34 DF, p-value: < 2.2e-16
plot(pop~day, tmp.data,
main = 'Population : observation vs. fitted value',
xlab="", ylab="",
type='l',
col='skyblue',
lwd=2, cex.axis=2, cex.main=2) +
points(pop~day, tmp.data, col="steelblue", cex=2, pch=16) +
lines(tmp.data$day, fitted(m1), col='red', lty=2, lwd=2)
잔차분석
plot(tmp.data$day, resid(m1),
pch=16, cex=2, xaxt='n',
xlab="", ylab="", main="residual plot", cex.main=2)
abline(h=0, lty=2, lwd=2)
독립성검정(DW test)
dwtest(m1)
Durbin-Watson test
data: m1
DW = 0.041645, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
- 아무 옵션이 없으면 0보다 크냐는게 기본!! 대립가설확인필요
dwtest(m1, alternative="two.sided")
Durbin-Watson test
data: m1
DW = 0.041645, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is not 0
- 양측검정
dwtest(m1, alternative="less")
Durbin-Watson test
data: m1
DW = 0.041645, p-value = 1
alternative hypothesis: true autocorrelation is less than 0
정규분포 검정(shapro-wilk test)
-
가설
\(H_0\): 정규분포를 따른다. VS \(H_1\): 정규분포를 따르지 않는다.
hist(resid(m1))
- 왼쪽으로 치우쳐져 있는 그림. 정규분포가 아닐거 같다.
shapiro.test(resid(m1)) #H_0 : 정규분포를 따른다.
Shapiro-Wilk normality test
data: resid(m1)
W = 0.87284, p-value = 0.000669
등분산성검정(Breusch–Pagan test)
-
가설
\(H_0\): 등분산 VS \(H_1\): 이분산
bptest(m1)
studentized Breusch-Pagan test
data: m1
BP = 0.0059664, df = 1, p-value = 0.9384
잔차에 대한 bptest를 한다. 위에 shapiro는 resid(m1)해줬지만 bptest는 그냥 바로 넣어 주면 된다.
pvalue값이 엄청 커서 기각 못함. 즉 등분산이다.
2차 선형 추세
\[\text{모형}: Z_t = \beta_0 + \beta_1 t + \beta_2 t^2 + \epsilon_t, t=1,\dots, n\]
<- lm(pop~t+t2, data=tmp.data)
m2 summary(m2)
Call:
lm(formula = pop ~ t + t2, data = tmp.data)
Residuals:
Min 1Q Median 3Q Max
-11.365 -4.779 -1.049 3.798 17.631
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2421.49090 4.05820 596.69 <2e-16 ***
t 78.78688 0.50576 155.78 <2e-16 ***
t2 -0.58847 0.01326 -44.38 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.67 on 33 degrees of freedom
Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
F-statistic: 1.083e+05 on 2 and 33 DF, p-value: < 2.2e-16
- t2말고 \(I(t^2)\)으로 작성해줘도 된다. 대신 \(t^2\)으로 하게 되면 오류가 나니까 I지시함수 꼭 써주기
plot(pop~day, tmp.data,
main = 'Population : observation vs. fitted value',
xlab="", ylab="",
type='l',
col='skyblue',
lwd=2, cex.axis=2, cex.main=2) +
points(pop~day, tmp.data, col="steelblue", cex=2, pch=16) +
lines(tmp.data$day, fitted(m2), col='red', lty=2, lwd=2)
잔차분석
plot(tmp.data$day, resid(m2),
pch=16, cex=2, xaxt='n',
xlab="", ylab="", main="residual plot", cex.main=2)
abline(h=0, lty=2, lwd=2)
독립성 검정(DW test)
dwtest(m2,alternative = "two.sided")
Durbin-Watson test
data: m2
DW = 0.31083, p-value = 1.744e-13
alternative hypothesis: true autocorrelation is not 0
dwtest(m2,alternative = "greater")
Durbin-Watson test
data: m2
DW = 0.31083, p-value = 8.72e-14
alternative hypothesis: true autocorrelation is greater than 0
정규성 검정
qqnorm(resid(m2), pch=16, cex=2)
qqline(resid(m2), col = 2, lwd=2)
hist(resid(m2))
shapiro.test(resid(m2)) ##shapiro-wilk test
Shapiro-Wilk normality test
data: resid(m2)
W = 0.94947, p-value = 0.1007
등분산성 검정
bptest(m2)
studentized Breusch-Pagan test
data: m2
BP = 8.2455, df = 2, p-value = 0.0162
- 이분산성이다.
로그변환 후 2차 추세
\[\text{모형}: ln(Z_t) = \beta_0 + \beta_1 t + \beta_2 t^2 + \epsilon_t, t=1,\dots, n\]
<- lm(log(pop)~t+t2, data=tmp.data)
m3 summary(m3)
Call:
lm(formula = log(pop) ~ t + t2, data = tmp.data)
Residuals:
Min 1Q Median 3Q Max
-0.009306 -0.003520 -0.000374 0.003284 0.010159
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.807e+00 2.360e-03 3307.25 <2e-16 ***
t 2.740e-02 2.942e-04 93.14 <2e-16 ***
t2 -3.004e-04 7.712e-06 -38.95 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.004461 on 33 degrees of freedom
Multiple R-squared: 0.9994, Adjusted R-squared: 0.9993
F-statistic: 2.664e+04 on 2 and 33 DF, p-value: < 2.2e-16
plot(log(pop)~day, tmp.data,
main = 'Population : observation vs. fitted value',
xlab="", ylab="",
type='l',
col='skyblue',
lwd=2, cex.axis=2, cex.main=2) +
points(log(pop)~day, tmp.data, col="steelblue", cex=2, pch=16) +
lines(tmp.data$day, fitted(m3), col='red', lty=2, lwd=2)
plot(pop~day, tmp.data,
main = 'Population : observation vs. fitted value',
xlab="", ylab="",
type='l',
col='skyblue',
lwd=2, cex.axis=2, cex.main=2) +
points(pop~day, tmp.data, col="steelblue", cex=2, pch=16) +
lines(tmp.data$day, exp(fitted(m3)), col='red', lty=2, lwd=2)
잔차분석
plot(tmp.data$day, resid(m3),
pch=16, cex=2, xaxt='n',
xlab="", ylab="", main="residual plot", cex.main=2)
abline(h=0, lty=2, lwd=2)
독립성 검정
dwtest(m3,alternative = "two.sided")
Durbin-Watson test
data: m3
DW = 0.16493, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is not 0
dwtest(m3,alternative = "greater")
Durbin-Watson test
data: m3
DW = 0.16493, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
정규성 검정
qqnorm(resid(m2), pch=16, cex=2)
qqline(resid(m2), col = 2, lwd=2)
hist(resid(m3))
shapiro.test(resid(m3))
Shapiro-Wilk normality test
data: resid(m3)
W = 0.97547, p-value = 0.5922
등분산성 검정
bptest(m3)
studentized Breusch-Pagan test
data: m3
BP = 8.8866, df = 2, p-value = 0.01176